import pdfplumber
import xlwt
from tqdm import tqdm
import jsonlines
import os
import pandas as pd
import traceback
import json
from glob import glob


def pdf2excel(path):
    with pdfplumber.open(path) as pdf:
        data = []
        for page in tqdm(pdf.pages):
            for rows in page.extract_tables():
                for i in range(len(rows)):
                    for j in range(len(rows[i])):
                        if bool(rows[i][j]) and '\n' in rows[i][j]:
                            rows[i][j] = rows[i][j].replace('\n', '')
                data.append(rows)

    excel_data = []
    # 首行处理合并表头

    excel_data.append(data[0][0])
    for page in data:
        for row in page:
            if row[0] in ['序号', '药品分类代码', "编码", "文件出处"] or row[0] == data[0][0][0]:
                continue
            excel_data.append(row)

    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('sheet1')
    for i, row in enumerate(excel_data):
        for j, cell in enumerate(row):
            worksheet.write(i, j, cell)
    workbook.save(path.replace('.pdf', '.xlsx'))


def jsonlines2excel(path):
    excel_data = []
    count = 0
    try:
        data = jsonlines.open(path, "r")
        for item in tqdm(data):
            count += 1
            excel_data.append(item)
    except:
        if count < 100:
            traceback.print_exc()
            with open(path, "r") as f:
                text = f.read()
            for item in text.split("\n"):
                if item:
                    data = json.loads(item)
                    excel_data.append(data)
    df = pd.DataFrame(excel_data)
    df = df.fillna("")
    df.to_excel(path.replace(os.path.basename(path).split(".")[-1], 'xlsx'), index=False)


if __name__ == '__main__':
    paths = [
        "/home/kv/Documents/WXWork/1688851095012743/Cache/File/2022-08/已解密/郑州市/河南省医疗服务价格项目规范.pdf",
    ]
    # paths = [
    #     "/home/kv/workspace/归一化库/7月29日/河南/郑州市/河南省肿瘤医院(郑州大学附属肿瘤医院)/河南省_郑州市_河南省肿瘤医院(郑州大学附属肿瘤医院)_医疗服务.json",
    # ]
    # paths = glob("/home/kv/workspace/归一化库/浙江/*/*/*.json")
    paths = glob("/kv-data/sw/查药扣费/数据/14-青岛市/青岛/*.json")
    for path in paths:
        # pdf2excel(path)
        jsonlines2excel(path)
